import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from statsmodels.tsa.statespace.sarimax import SARIMAX
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import os

# Switch the working directory to the hard-coded data folder so that the
# relative CSV name below resolves. This changes the cwd for the whole process.
os.chdir("C:/Users/agusv/Desktop/Estudio/Tesis/Csv")

# Read the dataset, parsing 'Date' as datetimes, then index the frame by it.
raw_frame = pd.read_csv("multivariate_biolng.csv", parse_dates=['Date'])
data = raw_frame.set_index('Date')

# Full target series (all rows); the comparison plot draws from it.
biolng_prices = data['BioLNG Price']

# Chronological hold-out split.
#
# The training filter used to be `year >= 2023`, which also selected every
# row of the 2024 hold-out year: the models were fitted on the observations
# they were later scored on. Training keeps its original lower bound (2023)
# but now stops before the test year, so the two partitions are disjoint.
TEST_YEAR = 2024
row_years = data.index.year
train_data = data[(row_years >= 2023) & (row_years < TEST_YEAR)]
test_data = data[row_years == TEST_YEAR]

# Fail early with a readable message instead of an opaque estimator error.
if train_data.empty or test_data.empty:
    raise ValueError(
        "Train/test split produced an empty partition; "
        "check the 'Date' range covered by the data."
    )

# Exogenous regressors and target; the same columns are used for both partitions.
feature_columns = ['Brent Spot Price', 'Inflation', 'USD/EUR']
X_training = train_data[feature_columns]
y_training = train_data['BioLNG Price']
X_testing = test_data[feature_columns]
y_testing = test_data['BioLNG Price']

# Baseline model: linear regression of the BioLNG price on the regressors,
# fitted on the training partition and evaluated on the hold-out partition.
linear_model = LinearRegression()
linear_model.fit(X_training, y_training)
y_pred_linear = linear_model.predict(X_testing)

# Hold-out accuracy: RMSE (same units as the target) and MAPE (percent).
linear_errors = y_testing - y_pred_linear
rmse_linear = np.sqrt(mean_squared_error(y_testing, y_pred_linear))
mape_linear = np.mean(np.abs(linear_errors / y_testing)) * 100

# Hybrid model: linear baseline + gradient-boosted correction of its residuals.

# Hold-out residuals of the baseline. Kept under its original name for
# diagnostics; it is no longer used as a training label.
residuals_linear = y_testing - y_pred_linear

# The corrector is fitted on the baseline's residuals over the TRAINING
# partition. Previously it was fitted on a random 80% of the hold-out
# residuals (the remaining 20% "validation" part was never used) and then
# scored on those same hold-out rows, so the reported error was measured on
# labels the booster had already seen.
train_residuals_linear = y_training - linear_model.predict(X_training)

xgboost_model_linear = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
xgboost_model_linear.fit(X_training, train_residuals_linear)

# Final prediction = baseline prediction + predicted residual on the hold-out rows.
y_pred_linear_xgb = y_pred_linear + xgboost_model_linear.predict(X_testing)
rmse_linear_xgb = np.sqrt(mean_squared_error(y_testing, y_pred_linear_xgb))
mape_linear_xgb = np.mean(np.abs((y_testing - y_pred_linear_xgb) / y_testing)) * 100

# Hybrid model: linear baseline + ARMA(2, 2) model of its residuals.

# The residual model is estimated on the baseline's residuals over the
# TRAINING partition. Previously it was estimated on the hold-out residuals
# themselves and its in-sample fit over those rows was reported as the
# prediction, so the error was measured on data used to fit the model.
train_residuals_sarimax = y_training - linear_model.predict(X_training)
sarimax_model_residuals = SARIMAX(train_residuals_sarimax, order=(2, 0, 2))
sarimax_residuals_results = sarimax_model_residuals.fit(disp=False)

# Predict the residual over the hold-out dates: in-sample where the training
# index covers them, otherwise a multi-step forecast past the training end.
# NOTE(review): forecasting by date beyond the training index requires a
# regular (inferable) date frequency - confirm this holds for the CSV.
residual_forecast = sarimax_residuals_results.predict(
    start=X_testing.index[0], end=X_testing.index[-1]
)

# Final prediction = baseline prediction + predicted residual.
y_pred_linear_sarimax = y_pred_linear + residual_forecast
rmse_linear_sarimax = np.sqrt(mean_squared_error(y_testing, y_pred_linear_sarimax))
mape_linear_sarimax = np.mean(np.abs((y_testing - y_pred_linear_sarimax) / y_testing)) * 100

# ARIMAX benchmark: SARIMAX on the target with the exogenous regressors,
# order (0, 2, 1) and seasonal order (1, 0, 0, 12), fitted on the training partition.
arimax_spec = dict(order=(0, 2, 1), seasonal_order=(1, 0, 0, 12))
arimax_model = SARIMAX(y_training, exog=X_training, **arimax_spec)
arimax_results = arimax_model.fit(disp=False)

# Predict across the hold-out dates, supplying the hold-out regressors.
first_test_date = X_testing.index[0]
last_test_date = X_testing.index[-1]
y_p_arimax = arimax_results.predict(start=first_test_date, end=last_test_date, exog=X_testing)

# Hold-out accuracy: RMSE and MAPE (percent).
arimax_errors = y_testing - y_p_arimax
rmse_arimax = np.sqrt(mean_squared_error(y_testing, y_p_arimax))
mape_arimax = np.mean(np.abs(arimax_errors / y_testing)) * 100

"""
from statsmodels.tsa.stattools import adfuller
from pmdarima import auto_arima
auto_arima_model = auto_arima(
    y_training,
    seasonal=True,
    m=12,
    stepwise=True,
    trace=True
)

print(auto_arima_model.summary())
auto_arima(residuals_linear).summary()
adf_test = adfuller(y_training.dropna())
print(f"ADF Test Statistic: {adf_test[0]}, P-value: {adf_test[1]}")
"""


# Overlay the observed price and each model's predictions for the hold-out dates.
plt.figure(figsize=(14, 6))

# Observed series from 2024 onwards, drawn as a solid line.
observed_from_2024 = biolng_prices[biolng_prices.index.year >= 2024]
plt.plot(observed_from_2024, label="Real BioLNG Price [€/kg]", color='blue')

# (predictions, legend label, line colour), listed in drawing order.
prediction_lines = [
    (y_pred_linear, "Linear", 'orange'),
    (y_p_arimax, "ARIMAX", 'red'),
    (y_pred_linear_xgb, "Linear + XGBoost", 'purple'),
    (y_pred_linear_sarimax, "Linear + SARIMAX", 'pink'),
]
for predicted, legend_label, line_colour in prediction_lines:
    plt.plot(test_data.index, predicted, label=legend_label, color=line_colour, linestyle='--')

plt.title("Prediction Comparison")
plt.xlabel("Date")
plt.ylabel("BioLNG Price [€/kg]")
plt.legend()
plt.show()

# One row per model: (name, MAPE in percent, RMSE), in the reported order.
model_scores = [
    ('ARIMAX', mape_arimax, rmse_arimax),
    ('Linear', mape_linear, rmse_linear),
    ('Linear + XGBoost', mape_linear_xgb, rmse_linear_xgb),
    ('Linear + SARIMAX', mape_linear_sarimax, rmse_linear_sarimax),
]

# Column-oriented mapping fed to the DataFrame; key order fixes column order.
error_analysing = {
    'Model': [name for name, _, _ in model_scores],
    'MAPE [%]': [mape for _, mape, _ in model_scores],
    'RMSE': [rmse for _, _, rmse in model_scores],
}
errores = pd.DataFrame(error_analysing)
print(errores)

"""
#Once the method with the lowest error (MAPE/RMSE) has been identified, in this case Linear + SARIMAX,
#we need to provide future values of the exogenous variables in order to forecast with it

dataf = pd.read_csv("multivariate_biolng.csv", parse_dates=['Date'])
dataf.set_index('Date', inplace=True)
dataf = dataf[dataf.index >= '2024-01-01']
brent_series = dataf['Brent Spot Price']
ipc_series = dataf['Variation']

from statsmodels.tsa.stattools import adfuller
from pmdarima import auto_arima
auto_arima_model = auto_arima(
    ipc_series,
    seasonal=True,
    m=12,
    stepwise=True,
    trace=True
)
print(auto_arima_model.summary())
auto_arima(residuals_linear).summary()
adf_test = adfuller(y_training.dropna())
print(f"ADF Test Statistic: {adf_test[0]}, P-value: {adf_test[1]}")
auto_arima(brent_series).summary()
auto_arima(ipc_series).summary()


sarimax_results_brent = SARIMAX(brent_series, order=(0, 1, 0)).fit(disp=False)
sarimax_results_ipc = SARIMAX(ipc_series, order=(0, 1, 1), seasonal_order=(1, 0, 0, 12)).fit(disp=False)

future_steps = 12
future_index = pd.date_range(start=dataf.index[-1] + pd.DateOffset(months=1), periods=future_steps, freq='MS')
brent_forecast = sarimax_results_brent.predict(start=future_index[0], end=future_index[-1])
ipc_forecast = sarimax_results_ipc.predict(start=future_index[0], end=future_index[-1])

future_var = pd.DataFrame({
    'Brent Spot Price': brent_forecast,
    'Variation': ipc_forecast
}, index=future_index)

X_training = dataf[['Brent Spot Price', 'Variation']]
y_training = dataf['BioLNG Price']
linear_model = LinearRegression().fit(X_training, y_training)
linear_predictions = linear_model.predict(future_var)

residuals = y_training - linear_model.predict(X_training)
sarimax_residuals_results = SARIMAX(residuals, order=(2, 0, 0)).fit(disp=False)

residuals_forecast = sarimax_residuals_results.predict(start=future_index[0], end=future_index[-1])
final_predictions = linear_predictions + residuals_forecast

confidence_interval = 0.05
upper_bound = final_predictions * (1 + confidence_interval)
lower_bound = final_predictions * (1 - confidence_interval)

predictions_df = pd.DataFrame({
    'Date': future_index,
    'Predicted BioLNG Price': final_predictions,
    'Upper Bound': upper_bound,
    'Lower Bound': lower_bound
})


datag = pd.read_csv("multivariate_biolng.csv", parse_dates=['Date'])
datag.set_index('Date', inplace=True)
datag = datag[datag.index >= '2023-07-01']

plt.figure(figsize=(14, 6))
plt.plot(datag['BioLNG Price'], label="Real BioLNG Price", color="blue")
plt.plot(future_index, final_predictions, label="Linear + SARIMAX Forecast", color="red", linestyle="--")
plt.scatter(future_index, final_predictions, color='blue', zorder=5)
plt.fill_between(future_index, lower_bound, upper_bound, color='orange', alpha=0.3)
plt.xlabel("Date")
plt.ylabel("BioLNG Price [€/kg]")
plt.title("BioLNG Price Prediction (Linear + SARIMAX)")
plt.legend()
plt.show()

"""
